/* 2-way FE estimation */

#delimit ;

* Start from an empty session and stop output from pausing;
clear all;
set more off;

* Stem of the post_2wFE, wfe and ffe output file names written below;
local outfile "GWgap_2wFE_extract_v4";

* Timestamp the start of the run in the log;
display _n "$S_DATE $S_TIME";











*1******************************************************************************;
* making data sets to run 2-way FE model on and running model;
*******************************************************************************;

foreach g in m f t {; * limit age range for full sample to make it run;
	
	* One pass per sample: m and f load their own extract, t pools both extracts;
	* and is restricted to mean_age 20-64;
	
	di _n "Loading data for gender `g': $S_TIME";
			
	if "`g'"=="m" | "`g'"=="f" use GWgap_2wFE_extract_pre_2wFE_`g', clear;
	local age_range "all ages";
	if "`g'"=="t" {;
		use GWgap_2wFE_extract_pre_2wFE_m, clear;
		append using GWgap_2wFE_extract_pre_2wFE_f;
		
		keep if mean_age>=20 & mean_age<65;
		local age_range "ages 20-64";
	};
	
	
	* deflating earnings and creating wage variable of interest;
	* the CPI file is keyed on year, so fin_yr is copied to a temporary year variable;
	* rows without a CPI match keep a missing deflator and are dropped further down;
	
	gen year = fin_yr;
	
	merge m:1 year using CPI, keep(master match) nogen;	
	drop year;

	gen rwage = nom_wage*deflator;

	gen rwage_pfte = 12*rwage/sum_fte;
	label var rwage_pfte "Real annual wage per FTE (2006 $)";

	gen lnwage = ln(rwage_pfte);
	label var lnwage "Real annual wage per FTE (ln 2006 $)";
	
	
	* creating dummies for financial year for regs;
	
	forvalues year = 2002/2018 {; * looping over financial years;

		gen byte fin_yr`year' = fin_yr==`year';
	};
	drop fin_yr;
	
	* cubic in age, with the higher powers rescaled;
	rename mean_age age;
	gen age2 = age^2/100;
	gen age3 = age^3/1000;
	
	* NOTE(review): the divisor is 120 here but the wage per FTE above uses 12;
	* confirm the units of sum_fte;
	gen lnfte = ln(sum_fte/120);
	label var lnfte "FTEs for the year at pent (ln)";
		
	keep snz_uid_gp pent_gp fin_yr* age* lnwage lnfte; 
	
	
	* dropping observations with missing data;
	
	foreach var of varlist lnwage lnfte age age2 age3 fin_yr* {;
		drop if `var'==.;
	};

	
	* keeping largest connected set only;
	* a2group labels the connected sets, and the largest one is selected explicitly;
	* by observation count rather than assuming that it is group 1;
	* ties go to the lowest group number;
	
	a2group, individual(snz_uid_gp) unit(pent_gp) groupvar(new);
	
	preserve;
	contract new, freq(group_obs) nomiss;
	gsort -group_obs new;
	local largest_group = new[1];
	restore;
	
	di _n "Largest connected set is group `largest_group'";
	keep if new==`largest_group';
	drop new;
	
	
	compress;
	
	
	* running 2-way FE regs;
	* worker effects are stored in wfe and firm effects in ffe;

	di _n "Starting a2reg $S_TIME";
	a2reg lnwage lnfte age age2 age3 fin_yr*, individual(snz_uid_gp) unit(pent_gp)
		indeffect(wfe) uniteffect(ffe);
	di _n "Finishing a2reg $S_TIME";

	notes: `outfile'_post_2wFE_`g' is estimated for the `g' population of `age_range';
	save `outfile'_post_2wFE_`g', replace;
			
};	






*2******************************************************************************;
* identifying largest firm (by months of male employment);
*******************************************************************************;

* Picks the pent with the most rows in the male extract and saves it as the;
* reference firm used later to normalise the firm fixed effects;
* NOTE(review): the section header says months of male employment but this counts;
* rows of the extract - confirm that row counts are the intended measure;

use GWgap_2wFE_extract_pre_2wFE_m, clear;

keep pent;

gen i = 1;

collapse (count) obs = i, by(pent);

* ties on obs are broken by pent so the chosen firm is the same on every run;
gsort -obs pent;

keep if _n==1;

keep pent;

* echo the chosen pent to the log;
tab pent;

save temp_largest_pent, replace;






*3******************************************************************************;
* creating files for men and women that map snz_uid and pent to wfe and ffe;
*******************************************************************************;

foreach g in m f t {; 

	* Attach the original pent and snz_uid identifiers to the estimated effects;
	* the result is saved as temp_m, temp_f and temp_t;

	use `outfile'_post_2wFE_`g', clear;
	
	* one row per grouped worker and firm pair with its two effects;
	keep pent_gp snz_uid_gp wfe ffe;
	duplicates drop;
	
	if "`g'"=="m" | "`g'"=="f" {;
		merge 1:m pent_gp snz_uid_gp using GWgap_2wFE_extract_pre_2wFE_`g', keepus(pent snz_uid) 
			keep(match) nogen;
		save temp_`g', replace;
	};
	if "`g'"=="t" {;
		* the pooled sample is matched against each gender extract in turn;
		* and the two matched pieces are stacked;
		preserve;
		merge 1:m pent_gp snz_uid_gp using GWgap_2wFE_extract_pre_2wFE_m, keepus(pent snz_uid) 
			keep(match) nogen;
		save temp_t_m, replace;
			
		restore;
		merge 1:m pent_gp snz_uid_gp using GWgap_2wFE_extract_pre_2wFE_f, keepus(pent snz_uid) 
			keep(match) nogen;
			
		append using temp_t_m;
		save temp_t, replace;
	};
};


* saving worker FE;

foreach g in m f t {; 

	* Writes one worker FE file and one firm FE file per sample;
	* gender and age_range are only used in the dataset notes;

	if "`g'"=="m" local gender males;
	if "`g'"=="f" local gender females;
	local age_range "all ages";
	if "`g'"=="t" {;
		local gender "males and females";
		local age_range "ages 20-64";
	};

	use temp_`g', clear;
	
	* one row per snz_uid;
	keep snz_uid wfe;
	duplicates drop;
	
	if "`g'"=="m" | "`g'"=="f" {;
		label var wfe "Worker fixed effect";
		notes wfe: Normalisation is such that sum of wfe for each gender is equal to 0.;
	};
	if "`g'"=="t" {;
		rename wfe wfe_t;
		label var wfe_t "Worker fixed effect from pooled genders";
		notes wfe_t: Normalisation is such that sum of wfe for both genders pooled is equal to 0.;
	};
	
	notes: `outfile'_wfe_`g'.dta was created by `outfile'.do on $S_DATE at $S_TIME. An observation 
		is an snz_uid. The sample is `gender' of `age_range' connected to the largest set of workers/firms.;
	save `outfile'_wfe_`g', replace;
	
	* saving firm FE;
	
	use temp_`g', clear;
	keep pent ffe;
	duplicates drop;
	
	merge 1:1 pent using temp_largest_pent;
	
	* the reference pent must be present in this sample;
	* without it every ffe would be set to missing and a stray row saved;
	quietly count if _merge==3;
	if r(N)==0 {;
		di as error "Largest pent from temp_largest_pent is not in sample `g': cannot normalise ffe";
		exit 459;
	};
	
	* norm holds the ffe of the reference pent on every row;
	* sorting puts the single non-missing value first so it can be filled down;
	* double storage avoids rounding the reference value if ffe is stored as double;
	gen double norm = ffe if _merge==3;
	sort norm;
	replace norm = norm[_n-1] if norm==.;
	replace ffe = ffe - norm;
	
	if "`g'"=="m" | "`g'"=="f" {;
		label var ffe "Firm fixed effect";
		notes ffe: Normalisation is the largest pent (for men) has FE of zero for both genders.;
	};
	if "`g'"=="t" {;
		rename ffe ffe_t;
		label var ffe_t "Firm fixed effect from pooled genders";
		notes ffe_t: Normalisation is the largest pent (for men) has FE of zero.;
	};
	
	notes: `outfile'_ffe_`g'.dta was created by `outfile'.do on $S_DATE at $S_TIME. An observation 
		is a pent. The sample is `gender' of `age_range' connected to the largest set of workers/firms.;
	save `outfile'_ffe_`g', replace;
};
	
	

